In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_decision_regions
from gensim.models import Word2Vec
In [2]:
!pip install numpy pandas matplotlib seaborn scikit-learn mlxtend gensim
Requirement already satisfied: numpy in d:\anaconda\lib\site-packages (1.23.5)
Requirement already satisfied: pandas in d:\anaconda\lib\site-packages (2.2.3)
Requirement already satisfied: matplotlib in d:\anaconda\lib\site-packages (3.9.2)
Requirement already satisfied: seaborn in d:\anaconda\lib\site-packages (0.13.2)
Requirement already satisfied: scikit-learn in d:\anaconda\lib\site-packages (1.5.2)
Requirement already satisfied: mlxtend in d:\anaconda\lib\site-packages (0.23.4)
Requirement already satisfied: gensim in d:\anaconda\lib\site-packages (4.3.3)
Requirement already satisfied: python-dateutil>=2.8.2 in d:\anaconda\lib\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in d:\anaconda\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in d:\anaconda\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: contourpy>=1.0.1 in d:\anaconda\lib\site-packages (from matplotlib) (1.3.1)
Requirement already satisfied: cycler>=0.10 in d:\anaconda\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\anaconda\lib\site-packages (from matplotlib) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in d:\anaconda\lib\site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: packaging>=20.0 in d:\anaconda\lib\site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=8 in d:\anaconda\lib\site-packages (from matplotlib) (11.0.0)
Requirement already satisfied: pyparsing>=2.3.1 in d:\anaconda\lib\site-packages (from matplotlib) (3.2.0)
Requirement already satisfied: scipy>=1.6.0 in d:\anaconda\lib\site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in d:\anaconda\lib\site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in d:\anaconda\lib\site-packages (from scikit-learn) (3.5.0)
Requirement already satisfied: smart-open>=1.8.1 in d:\anaconda\lib\site-packages (from gensim) (7.1.0)
Requirement already satisfied: six>=1.5 in d:\anaconda\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: wrapt in d:\anaconda\lib\site-packages (from smart-open>=1.8.1->gensim) (1.14.1)
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
In [3]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='hist')
    corrs = df.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('%.1f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Таблица анализа данных, коэффициент корреляции')
    plt.show()
In [4]:
def decision_boundary_plot(X, y, X_train, y_train, clf, feature_indexes, title=None):
    feature1_name, feature2_name = X.columns[feature_indexes]
    X_feature_columns = X.values[:, feature_indexes]
    X_train_feature_columns = X_train[:, feature_indexes]
    clf.fit(X_train_feature_columns, y_train)

    plot_decision_regions(X=X_feature_columns, y=y.values, clf=clf)
    plt.xlabel(feature1_name)
    plt.ylabel(feature2_name)
    plt.title(title)
In [5]:
# загружаем полный датасет
raw_table_data = pd.read_csv('https://raw.githubusercontent.com/TAUforPython/BioMedAI/main/test_datasets/test_data_ECG.csv', nrows=5000)
# raw_table_data = pd.read_csv()
raw_table_data.head(10)
Out[5]:
subject_id Count_subj study_id cart_id Healthy_Status eeg_time eeg_date report_0 report_1 report_2 ... filtering rr_interval p_onset p_end qrs_onset qrs_end t_end p_axis qrs_axis t_axis
0 19557662 27 40000017 6848296 0 8:44 AM 27.06.2015 Sinus rhythm Possible right atrial abnormality NaN ... 60 Hz notch Baseline filter 659 40 128 170 258 518 81 77 79
1 18477137 93 40000029 6848296 0 9:54 AM 27.06.2015 Sinus rhythm Possible right atrial abnormality NaN ... 60 Hz notch Baseline filter 722 40 124 162 246 504 77 75 70
2 16598616 3 40000035 6376932 1 9:07 AM 28.06.2015 Sinus tachycardia NaN Normal ECG except for rate ... 60 Hz notch Baseline filter 600 40 130 162 244 474 79 72 77
3 16368287 7 40000079 6214760 1 5:14 PM 15.07.2015 Sinus rhythm NaN Normal ECG ... 60 Hz notch Baseline filter 659 40 146 180 254 538 79 66 69
4 18370366 2 40000084 6632385 0 1:52 PM 27.09.2015 Sinus rhythm NaN NaN ... <not specified> 659 368 29999 504 590 868 84 80 77
5 15606157 55 40000089 6632385 0 2:29 PM 29.10.2013 Sinus rhythm NaN NaN ... <not specified> 822 365 29999 499 592 852 26 46 30
6 12576058 43 40000115 6852956 1 12:54 PM 23.03.2016 Sinus rhythm NaN Normal ECG ... 60 Hz notch Baseline filter 952 40 146 198 282 598 24 80 20
7 14691089 1 40000143 6551957 0 10:01 AM 10.12.2016 Sinus rhythm rSr'(V1) - probable normal variant Low QRS voltages in precordial leads ... 60 Hz notch Baseline filter 923 40 140 188 278 594 26 86 13
8 14144725 7 40000144 6924910 0 7:24 AM 11.12.2011 Sinus rhythm with PAC(s). NaN Borderline ECG ... 60 Hz notch Baseline filter 952 40 180 196 294 610 59 -17 3
9 16089780 2 40000152 6919786 0 12:35 PM 13.12.2011 Sinus rhythm Extensive T wave changes may be due to myocard... NaN ... 60 Hz notch Baseline filter 1000 40 156 178 274 584 8 -11 19

10 rows × 36 columns

In [6]:
# Предобрабатываем данные словестных репортов word2vec в числовой вектор
# 这段代码通过去除异常值、合并和清理文本报告数据,使用 Word2Vec 模型将文本转化为数值表示,并将其作为新的特征用于后续分析。

# Убираем выбросы数据过滤,过滤掉所有在 columns_to_filter 列中具有大于 2000 的值的行,
# 筛选出 p_onset 小于 p_end 且 qrs_onset 小于 qrs_end 的行。这是为了去除数据中的异常值(如时间顺序错误等)
columns_to_filter = ['rr_interval', 'p_onset', 'p_end', 'qrs_onset', 'qrs_end', 't_end', 'p_axis', 'qrs_axis', 't_axis']
full_df_filtered = raw_table_data[(raw_table_data[columns_to_filter] < 2000).all(axis=1)]
full_df_filtered = full_df_filtered[(full_df_filtered['p_onset'] < full_df_filtered['p_end']) & (full_df_filtered['qrs_onset'] < full_df_filtered['qrs_end'])]

# Слепляем все текстовые отчеты в один большой文本数据预处理,
# 将 report_0 到 report_17 列中的文本数据合并成一个单一的文本列 report。每个报告通过空格连接。接着,去除字符串中的 nan 和多余的空格,确保文本格式清洁。
reports = [f'report_{x}' for x in range(18)]
full_df_filtered['report_0'] = full_df_filtered[reports].astype(str).agg(' '.join, axis=1)
full_df_filtered['report_0'] = full_df_filtered['report_0'].str.replace(r'\bnan\b', '', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
full_df_filtered.rename(columns={'report_0': 'report'}, inplace=True)
reports_to_drop = [f'report_{x}' for x in range(1, 18)]
full_df_filtered = full_df_filtered.drop(reports_to_drop, axis=1)

# Фиксим имена столбцов 修正列名字;删除了 bandwidth 和 filtering 列(无关列)
full_df_filtered = full_df_filtered.rename(columns={'eeg_time ': 'eeg_time', 'eeg_date ': 'eeg_date'})
full_df_filtered = full_df_filtered.drop(columns = ['bandwidth', 'filtering'])

# Делаем колонку с таргетами крайней справа将 Healthy_Status 列移到数据框的最右侧
full_df_filtered = full_df_filtered[[col for col in full_df_filtered.columns if col != 'Healthy_Status'] + ['Healthy_Status']]

# Разбиваем столбец с текстовым отчетом на токены分词,将每个报告report的文本按空格进行分割,转换成词语列表
words = [text.split() for text in full_df_filtered['report']]

# Обучаем модель Word2Vec使用 Word2Vec 模型训练分词后的文本数据。这个模型会将每个词转换为一个向量。
w2v_model = Word2Vec(words)

# Функция для получения среднего эмбеддинга строки
# 定义了一个函数 get_sentence_embedding,用于计算每个报告的句子嵌入向量。首先将报告拆分成单词,然后从训练好的 Word2Vec 模型中提取每个单词的向量。
# 如果该单词有对应的向量,则返回所有词向量的均值;如果没有,则返回一个全零的向量。
# 之后,通过 .apply 方法将该函数应用到所有报告上,并将每个报告的嵌入向量的均值作为该报告的数值表示。
def get_sentence_embedding(sentence):
    words = sentence.split()# 将输入的 sentence(句子)按空格拆分成一个个单词
    word_vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv] 
    # 对于句子中的每个单词,代码尝试从训练好的 Word2Vec 模型 w2v_model 中查找该单词的词向量
    # 如果该单词在 Word2Vec 模型的词汇表中(word in w2v_model.wv),则提取对应的词向量;如果不在词汇表中,则跳过该单词。
    if word_vectors:
        return np.mean(word_vectors, axis=0)  # Усреднённый вектор
    #如果该句子中至少有一个词向量(即 word_vectors 列表非空),则通过 np.mean(word_vectors, axis=0) 计算所有单词词向量的均值。
    # 这意味着句子的嵌入向量是句子中所有单词的向量的平均值。这样可以代表整个句子的含义。
    else:
        return np.zeros(w2v_model.vector_size)  # Нулевой вектор, если слова нет в модели
    # 如果句子中的单词都不在词汇表中,word_vectors 列表为空,则返回一个全零向量(np.zeros(w2v_model.vector_size)),即用一个默认的零向量表示这个句子。

# Применяем функцию ко всей колонке и переводим каждый вектор в одно число
# 查看预处理后的数据: 显示数据框的前几行,以检查预处理的结果。
full_df_filtered['report'] = full_df_filtered['report'].apply(lambda x: get_sentence_embedding(x).mean())

full_df_filtered.head()
     
Out[6]:
subject_id Count_subj study_id cart_id eeg_time eeg_date report rr_interval p_onset p_end qrs_onset qrs_end t_end p_axis qrs_axis t_axis Healthy_Status
0 19557662 27 40000017 6848296 8:44 AM 27.06.2015 0.012239 659 40 128 170 258 518 81 77 79 0
1 18477137 93 40000029 6848296 9:54 AM 27.06.2015 0.012239 722 40 124 162 246 504 77 75 70 0
2 16598616 3 40000035 6376932 9:07 AM 28.06.2015 0.035913 600 40 130 162 244 474 79 72 77 1
3 16368287 7 40000079 6214760 5:14 PM 15.07.2015 0.022903 659 40 146 180 254 538 79 66 69 1
6 12576058 43 40000115 6852956 12:54 PM 23.03.2016 0.022903 952 40 146 198 282 598 24 80 20 1
In [7]:
# make new dataset from old columns
table_data = full_df_filtered[['report','rr_interval','p_end','qrs_onset','qrs_end','t_end','p_axis','qrs_axis','t_axis','Healthy_Status']].copy()
table_data.head()
Out[7]:
report rr_interval p_end qrs_onset qrs_end t_end p_axis qrs_axis t_axis Healthy_Status
0 0.012239 659 128 170 258 518 81 77 79 0
1 0.012239 722 124 162 246 504 77 75 70 0
2 0.035913 600 130 162 244 474 79 72 77 1
3 0.022903 659 146 180 254 538 79 66 69 1
6 0.022903 952 146 198 282 598 24 80 20 1
In [8]:
# 绘制数据框(table_data)中每一列的箱形图(boxplot),用于可视化每个特征的数据分布情况。
n = table_data.shape[1]
fig,ax = plt.subplots(1,n, figsize=(n*2,4), sharex=True)
for i in range(n):
    plt.sca(ax[i])
    col = table_data.columns[i]
    sns.boxplot(y = table_data.iloc[:,i],data=table_data, medianprops={"color": "r", "linewidth": 2})
No description has been provided for this image
In [9]:
# удаляем аномалии из данных (убрать все значения более 5000)
#table_data = table_data.drop(table_data[table_data['qrs_axis'] > 5000].index)
for i, v in enumerate(table_data):
  table_data = table_data.drop(table_data[table_data.iloc[:,i] > 10000].index)
In [10]:
#绘制新的箱型图
n = table_data.shape[1]
fig,ax = plt.subplots(1,n, figsize=(n*2,4), sharex=True)
for i in range(n):
    plt.sca(ax[i]) #每个子图共享 X 轴,便于比较不同列的数据分布
    col = table_data.columns[i] # 获取当前列的名称
    sns.boxplot(y = table_data.iloc[:,i],data=table_data,
                medianprops={"color": "r", "linewidth": 2})#绘制箱形图
    plt.tight_layout# 调整子图的布局,避免子图之间的重叠,确保每个子图都能清晰地显示出来。这个函数会自动调整子图之间的间距,使得图形看起来更加整洁。
No description has been provided for this image
In [11]:
sns.countplot(data=table_data, x="Healthy_Status") #使用sns绘制条形图
plt.title("Distribution of Healthy Status")
plt.show()
No description has been provided for this image
In [12]:
#example view 绘制 table_data 数据框中的 rr_interval 列的数据
plt.plot(table_data['rr_interval'],'.')
Out[12]:
[<matplotlib.lines.Line2D at 0x2f90fd29b90>]
No description has been provided for this image
In [13]:
# 这段代码用于绘制一个热力图(heatmap),展示 table_data 数据框中各列之间的相关性。
sns.heatmap(table_data.corr(),annot=True,fmt="0.2f",cmap="coolwarm")#设置显示的数值格式为保留两位小数;指定热力图的配色方案
plt.show()
No description has been provided for this image
In [14]:
# 调用了之前讨论的 plotScatterMatrix 函数,并传递了 table_data 数据框以及绘图的尺寸参数
plotScatterMatrix(table_data, 7, 10)
# plotScatterMatrix 函数的功能:
# 选择数值型列:plotScatterMatrix 函数首先会选择数据框中所有数值型的列进行绘制。
# 去除缺失值和常量列:函数会去除包含缺失值的列,并保留那些至少有两个不同值的列。这样可以避免在绘制时出现奇异的结果。
# 绘制散点矩阵:接着,函数会使用 pandas.plotting.scatter_matrix 绘制散点矩阵图。每对变量之间都会有一个散点图,主对角线则是该变量的直方图。
# 计算并显示相关性:在每个非对角线的散点图上,函数会计算并显示每对变量之间的相关性系数,帮助你观察不同特征之间的线性关系。
No description has been provided for this image
In [15]:
fig,ax = plt.subplots(figsize=(15,5))# 创建一个包含子图(ax)的图形对象(fig)
#sns.lineplot(x='age',y='debtinc',data=table_data,ax=ax)
sns.lineplot(x='rr_interval',y='t_end',data=table_data,ax=ax) #h绘制折线图(两者关系)
Out[15]:
<Axes: xlabel='rr_interval', ylabel='t_end'>
No description has been provided for this image
In [16]:
# 使用 Seaborn 绘制了一组散点图矩阵(pairplot),其中的每一对变量之间的关系被展示为散点图。
# 它还使用 Healthy_Status 列作为类别来着色,帮助观察不同健康状态类别之间的分布差异。
#labels = pd.DataFrame(table_data['Healthy_Status'])
sns.pairplot(table_data, hue = "Healthy_Status", #Healthy_Status 列的不同值会用不同的颜色显示,帮助我们区分不同健康状态类别的数据点。
             height=1.5, # 设置每个子图的高度为 1.5 英寸。这里设置较小的 height 是为了调整图形的紧凑性,适合小尺寸的数据集。如果数据集较大,可以增大这个值。
             plot_kws=dict(alpha=0.3))# alpha=0.3 表示散点图中的数据点的透明度设置为 30%。
Out[16]:
<seaborn.axisgrid.PairGrid at 0x2f90fcbe1d0>
No description has been provided for this image
In [17]:
# 使用 Seaborn 绘制了一个 散点图(stripplot),该图展示了 rr_interval 和 qrs_end 这两个变量之间的关系,并且使用 Healthy_Status 列来给数据点着色
sns.stripplot(x=table_data['rr_interval'],y=table_data['qrs_end'],
              hue=table_data['Healthy_Status'],jitter=True)
Out[17]:
<Axes: xlabel='rr_interval', ylabel='qrs_end'>
No description has been provided for this image
In [18]:
# 使用 seaborn 库的 swarmplot 函数绘制了一个 蜂群图,展示了 rr_interval 与 Healthy_Status 之间的关系
sns.swarmplot(data=table_data, x="rr_interval", hue="Healthy_Status")
Out[18]:
<Axes: xlabel='rr_interval'>
D:\anaconda\Lib\site-packages\seaborn\categorical.py:3399: UserWarning: 58.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
No description has been provided for this image
In [20]:
# 使用 Seaborn 的 violinplot 函数绘制了 rr_interval 在不同健康状态下的分布情况。
# 小提琴图是一种综合了箱形图和密度图的图表,展示了数据的分布、集中趋势和变异性。
plt.figure(figsize=(14, 6))
sns.violinplot(data=table_data, y='rr_interval', x='Healthy_Status')
plt.title('rr_interval Distribution by Medical Condition')
plt.show()
No description has been provided for this image
In [21]:
# 以上为数据预处理部分+可视化部分
In [22]:
# 以下为PCA分析和t-SNE分析
In [23]:
# PCA主成分分析前的准备工作
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
In [24]:
#table_data_pca = full_df_filtered #将 full_df_filtered 数据框赋值给 table_data_pca。
table_data_pca = table_data
#full_df_filtered 是经过处理的原始数据,而 table_data_pca 是将用于后续PCA分析的目标数据框
table_data_pca = table_data_pca.drop('Healthy_Status', axis = 1)
# 删除数据框 table_data_pca 中的 Healthy_Status 列。
# 因为 PCA 是一种无监督的降维技术,通常仅用于数值特征数据,因此需要去除目标列(这里的 Healthy_Status 列)进行降维
table_data_pca.columns # 返回 table_data_pca 数据框中剩余的列名
Out[24]:
Index(['report', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 't_end',
       'p_axis', 'qrs_axis', 't_axis'],
      dtype='object')
In [25]:
#cancer = load_breast_cancer()
# Before applying PCA, each feature should be centered (zero mean) and with unit variance
scaled_data = StandardScaler().fit(table_data_pca).transform(table_data_pca)# 将数据按列进行标准化处理,使得每列的均值为 0,标准差为 1。
pca = PCA(n_components = 2).fit(scaled_data) # 创建了一个 PCA 对象,指定将数据降到 2 维;即希望通过 PCA 提取前两个主成分(即二维的降维)
# PCA(copy=True, n_components=2, whiten=False)
x_pca = pca.transform(scaled_data)
#将标准化后的数据 scaled_data 转换到主成分空间。通过这个步骤,数据会被投影到新的坐标系中,其中第一主成分和第二主成分是主要的轴
print(table_data.shape, x_pca.shape)
(3448, 10) (3448, 2)
In [26]:
percent = pca.explained_variance_ratio_# 这是 PCA 模型的一个属性,表示每个主成分对数据方差的贡献比例
print(percent)
print(sum(percent))
#To see how much variance is preserved for each dataset.
# 结果中第一个主成分解释了34%的方差,第二个主成分解释了13%的方差,两个主成分共同保留了48%的数据方差
[0.34425404 0.13828051]
0.48253454649422284
In [27]:
# 这段代码定义了一个名为 pca_explained 的函数,用于计算并显示每个主成分在主成分分析(PCA)中所解释的方差比例,直到累计方差解释比例超过指定的阈值。
def pca_explained(X, threshold):
  features = X.shape[1]
  for i in range(2, features):
    pca = PCA(n_components = i).fit(X)
    sum_ = pca.explained_variance_ratio_
    # add all components explained variances
    percent = sum(sum_)
    print('{} components at {:.2f}% explained variance'.format(i,percent*100))
    if percent > threshold:
      break

pca_explained(scaled_data, 0.85)
# 结果含义:
# 使用前 2 个主成分时,能够解释约 48.12% 的方差。
# 使用前 3 个主成分时,能够解释约 61.17%的方差。
# 使用前 4 个主成分时,能够解释约 72.99% 的方差,
# 使用前 5 个主成分时,能够解释约 82.25% 的方差,
# 使用前 6 个主成分时,能够解释约 90.23% 的方差,超过阈值,停止计算
2 components at 48.25% explained variance
3 components at 61.27% explained variance
4 components at 73.08% explained variance
5 components at 82.27% explained variance
6 components at 90.26% explained variance
In [28]:
# 将通过 PCA 得到的前两个主成分绘制为散点图,并根据 Healthy_Status 列的数据进行着色,以显示健康状态类别之间的分布情况
plt.figure(figsize=(8,6)) # 创建一个图形,并设置图形的大小为 8 英寸宽,6 英寸高
plt.scatter(x_pca[:,0], x_pca[:,1], c=full_df_filtered['Healthy_Status'], cmap='plasma', alpha=0.4, edgecolors='black', s=65);
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
Out[28]:
Text(0, 0.5, 'Second Principal Component')
No description has been provided for this image
In [29]:
# 首先将 PCA 的成分值(即主成分的载荷)放入一个 DataFrame,并绘制了一个散点图,展示了降维后的数据。
# 在散点图的基础上,还展示了每个特征在主成分空间中的投影,利用箭头和标签指示每个特征如何在主成分上进行投影。
# put feature values into dataframe
components = pd.DataFrame(pca.components_.T, index=table_data_pca.columns, columns= ['PCA1','PCA2'])

# 绘制散点图
# plot size
plt.figure(figsize=(10,8))
# main scatterplot
plt.scatter(x_pca[:,0], x_pca[:,1], c=full_df_filtered['Healthy_Status'],
            cmap='plasma', alpha=0.4, edgecolors='black', s=40);
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
#plt.ylim(15,-15);
#plt.xlim(20,-20);

# individual feature values添加第二个坐标轴
ax2 = plt.twinx().twiny();
#ax2.set_ylim(-0.5,0.5);
#ax2.set_xlim(-0.5,0.5);

# reference lines设置参考线;虚线用于在主成分空间中表示零点,帮助判断特征在主成分方向上的分布
ax2.hlines(0,-0.5,0.5, linestyles='dotted', colors='grey')
ax2.vlines(0,-0.5,0.5, linestyles='dotted', colors='grey')

# offset for labels绘制箭头和文本;箭头的方向和长度表示该特征在各主成分上的贡献
offset = 0.95
# arrow & text
for a, i in enumerate(components.index):
  ax2.arrow(0, 0, components['PCA1'][a], -components['PCA2'][a], alpha=0.5, facecolor='white', head_width=.01)
  ax2.annotate(i, (components['PCA1'][a]*offset, -components['PCA2'][a]*offset), color='orange')
     
C:\Users\25150\AppData\Local\Temp\ipykernel_9032\1802625492.py:30: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ax2.arrow(0, 0, components['PCA1'][a], -components['PCA2'][a], alpha=0.5, facecolor='white', head_width=.01)
C:\Users\25150\AppData\Local\Temp\ipykernel_9032\1802625492.py:31: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ax2.annotate(i, (components['PCA1'][a]*offset, -components['PCA2'][a]*offset), color='orange')
No description has been provided for this image
In [30]:
# 可视化 PCA(主成分分析)中每个主成分的成分矩阵(载荷矩阵)。
# 使用 imshow 函数生成一个热图,以展示特征在主成分上的权重(即每个特征对主成分的贡献程度)
fig = plt.figure(figsize=(8, 4))#创建图形和设置大小
plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')#绘制热图
feature_names = list(table_data_pca.columns)# 设置坐标轴
plt.gca().set_xticks(np.arange(-.5, len(feature_names)-1));
plt.gca().set_yticks(np.arange(0.5, 2));
plt.gca().set_xticklabels(feature_names, rotation=90, ha='left',fontsize=12);#设置刻度标签
plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom',fontsize=12);
plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0,
                                              pca.components_.max()],pad=0.65);#添加颜色条
No description has been provided for this image
In [31]:
# 以上为PCA主成分分析法
In [32]:
# 以下为t-SNE方法
In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
In [34]:
# 将目标列 'Healthy_Status' 排除,假设你已经准备好了数据集
#X = full_df_filtered.drop(columns=['Healthy_Status'])  # 去除目标变量列
#y = full_df_filtered['Healthy_Status']  # 目标变量
X = table_data.drop(columns=['Healthy_Status'])  # 去除目标变量列
y = table_data['Healthy_Status']  # 目标变量

# 对特征数据进行标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 使用t-SNE进行降维,将数据降到2维
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# 创建 DataFrame 存储 t-SNE 降维后的数据
tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
tsne_df['Healthy_Status'] = y

# 绘制 t-SNE 结果
plt.figure(figsize=(10, 8))
scatter = plt.scatter(tsne_df['TSNE1'], tsne_df['TSNE2'], c=tsne_df['Healthy_Status'], cmap='plasma', alpha=0.6, edgecolors='k', s=50)

# 添加颜色条
plt.colorbar(scatter)
plt.title('t-SNE visualization of the dataset')
plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.show()
No description has been provided for this image
In [35]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# 假设 table_data 已经加载并进行了预处理
# 提取特征数据和目标变量
X = table_data.drop(columns=['Healthy_Status'])  # 去掉目标变量 'Healthy_Status'
y = table_data['Healthy_Status']  # 提取目标变量 'Healthy_Status'

# 对特征数据进行标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 创建 t-SNE 实例,并将数据降维至 3 维
tsne = TSNE(n_components=3, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# 创建 DataFrame 存储 t-SNE 降维后的数据
tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])  # 三个主成分
tsne_df['Healthy_Status'] = y  # 添加目标变量

# 绘制三维 t-SNE 结果
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')  # 创建三维坐标轴

# 绘制散点图
scatter = ax.scatter(tsne_df['TSNE1'], tsne_df['TSNE2'], tsne_df['TSNE3'], c=tsne_df['Healthy_Status'], cmap='plasma', alpha=0.6, edgecolors='k', s=50)

# 设置标题和标签
ax.set_title("3D t-SNE Visualization of the Dataset")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.set_zlabel("t-SNE Component 3")

# 添加颜色条
fig.colorbar(scatter)

# 显示图表
plt.show()
No description has been provided for this image
In [36]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# 生成样本数据
#X, y = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
X = table_data.drop(columns=['Healthy_Status'])  # 去除目标变量列
y = table_data['Healthy_Status']  # 目标变量

# 可视化生成的数据
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, s=30, cmap='viridis')
plt.title("Original Dataset")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()
No description has been provided for this image
In [37]:
from sklearn.manifold import TSNE

# 创建 t-SNE 实例,并将数据降维至 2 维
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# 可视化降维后的数据
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, s=30, cmap='viridis')
plt.title("t-SNE Reduced Data")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()
No description has been provided for this image
In [38]:
#---------------------以下为lab3--------------------------
In [39]:
#-----ML: GussianClassifier + описание классов и матрица ошибок---------------
# 高斯分类器+类别描述+错误矩阵/混淆矩阵
In [40]:
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
In [41]:
table_data.columns
Out[41]:
Index(['report', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 't_end',
       'p_axis', 'qrs_axis', 't_axis', 'Healthy_Status'],
      dtype='object')
In [42]:
# set the classification index of table
clf_index = 9
# 设置分类目标列的索引,第9列是你要分类的类别列(target class)
In [43]:
#类别描述
plt.figure(figsize=(35,20),dpi=90)
# Plot frequency percentages barplot
table_data[table_data.columns[clf_index]].value_counts(normalize=True).mul(100).plot(kind='barh', width=0.8, figsize=(8,5))

# Add frequency percentages to the plot
labels = table_data[table_data.columns[clf_index]].value_counts(normalize=True).mul(100).round(1)
for i in labels.index:
    plt.text(labels[i], i, str(labels[i])+ '%', fontsize=15, weight='bold')

plt.xlim([0, 110])
plt.xlabel('Frequency Percentage', fontsize=13)
plt.ylabel(table_data.columns[clf_index], fontsize=13)
plt.title('Frequency Percentage of Target Classes', fontsize=13)
plt.show()
No description has been provided for this image
In [44]:
#提取分类目标(y),提取输入特征(X),并把数据集划分为训练集和测试集(70%训练,30%测试)
#同时保持类别比例一致(stratify)。
# classification variable 从表格中取出第 clf_index 列,作为目标分类变量(y
y1 = table_data.iloc[:, clf_index]
# input variable  删除掉第 clf_index 列(目标变量那一列),剩下的全部作为输入特征(X)
X1 = table_data.drop(table_data.columns[[clf_index]], axis=1).iloc[:,:]
# 用 LabelEncoder 对目标变量 y1 进行编码(将字符串类别转换为数字,如:A→0,B→1)
# 并转成 pandas 的 Series 类型。
y1 = pd.Series(LabelEncoder().fit_transform(y1))
# 这行代码的作用是将数据随机分为训练集和测试集
X1_train, X1_test, y1_train, y1_test = train_test_split(X1.values,
                                                        y1.values,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        stratify=y1.values)
In [45]:
# 各部分的形状信息(即样本数量和特征维度)
X_train,X_test,y_train,y_test = train_test_split(X1.values,
                                                 y1.values,
                                                 test_size=0.3,
                                                 random_state=41,
                                                 stratify=y1.values)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
X_train shape: (2413, 9)
y_train shape: (2413,)
X_test shape: (1035, 9)
y_test shape: (1035,)
In [46]:
#使用 GaussianNB(高斯朴素贝叶斯分类器)进行训练和预测,并评估分类效果
GussianClassifier = GaussianNB()
GussianClassifier.fit(X1_train,y1_train)
y_pred=GussianClassifier.predict(X_test)
print("===================================> Result <===================================")
print("Accuracy                     = " ,metrics.accuracy_score(y_test,y_pred))
print("F1 Score                     = " ,metrics.f1_score(y_test,y_pred))
===================================> Result <===================================
Accuracy                     =  0.927536231884058
F1 Score                     =  0.8803827751196173
In [47]:
# 通过混淆矩阵评估分类效果
from sklearn.metrics import ConfusionMatrixDisplay
In [48]:
# 分类模型评估流程
target_names = ['True', 'False']
labels_names = [0,1]
print(classification_report(y_test, y_pred,labels=labels_names, target_names=target_names))
# 生成分类报告
#cm = confusion_matrix(y_test, y_pred,labels=labels_names,normalize='true')
cm = confusion_matrix(y_test, y_pred,labels=labels_names)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=target_names)
disp = disp.plot(cmap=plt.cm.Blues,values_format='g')
plt.show()
# 绘制混淆矩阵
              precision    recall  f1-score   support

        True       0.98      0.92      0.95       747
       False       0.81      0.96      0.88       288

    accuracy                           0.93      1035
   macro avg       0.90      0.94      0.91      1035
weighted avg       0.94      0.93      0.93      1035

No description has been provided for this image
In [49]:
cm = confusion_matrix(y_test,GussianClassifier.predict(X_test))

fig , ax  = plt.subplots(figsize=(4,4))
ax.imshow(cm, cmap = 'plasma')
ax.grid(False)
ax.xaxis.set(ticks=(0,1),ticklabels=("Predicted as True","Predicted as False"))
ax.yaxis.set(ticks=(0,1),ticklabels=("Actual as True","Actual as False"))
ax.set_ylim(1.5,-0.5)
for i in range(2):
    for j in range(2):
        ax.text(j,i,cm[i,j],ha="center",va="center",color = "red")
No description has been provided for this image
In [50]:
# --------------举例 AutoML----------------

# Убираем из датасета лишние столбцы
# Формируем датасет для AutoML методов
autoML_df = full_df_filtered[['report', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 't_end', 'p_axis', 'qrs_axis', 't_axis', 'Healthy_Status']].copy()
In [51]:
# AutoML H2O 使用的标准启动流程 
# 安装 H2O Python 包(需要联网且在支持 Jupyter / Colab / 本地环境下运行)
!pip install h2o
Requirement already satisfied: h2o in d:\anaconda\lib\site-packages (3.46.0.6)
Requirement already satisfied: requests in d:\anaconda\lib\site-packages (from h2o) (2.32.3)
Requirement already satisfied: tabulate in d:\anaconda\lib\site-packages (from h2o) (0.9.0)
Requirement already satisfied: charset-normalizer<4,>=2 in d:\anaconda\lib\site-packages (from requests->h2o) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in d:\anaconda\lib\site-packages (from requests->h2o) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in d:\anaconda\lib\site-packages (from requests->h2o) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in d:\anaconda\lib\site-packages (from requests->h2o) (2024.8.30)
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
In [52]:
import h2o
from h2o.frame import H2OFrame
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, roc_auc_score
In [54]:
# Инициализируем H2O
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Warning: Your H2O cluster version is (4 months and 22 days) old.  There may be a newer version available.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
H2O_cluster_uptime: 43 secs
H2O_cluster_timezone: Europe/Moscow
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.46.0.6
H2O_cluster_version_age: 4 months and 22 days
H2O_cluster_name: H2O_from_python_25150_atuyo6
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 7.882 Gb
H2O_cluster_total_cores: 32
H2O_cluster_allowed_cores: 32
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.11.5 final
In [55]:
#数据格式转换、数据集划分、指定目标变量和特征列

# Переводим датафрейм в формат H2OFrame 
# 把 Pandas DataFrame(比如 autoML_df)转换为 H2O 使用的专有数据格式 H2OFrame,这是 H2O 建模的输入格式
h2o_df = H2OFrame(autoML_df)

#h2o_df['Healthy_Status'] = h2o_df['Healthy_Status'].asfactor()#这样 H2O 就会知道这是一个二分类问题,而不是连续值回归。

# Разбиение на обучающую и тестовую выборки
# 把 H2O 数据集 h2o_df 按照 85% : 15% 的比例,随机划分为训练集(train)和测试集(test)
random_seed = 17
train, test = h2o_df.split_frame(ratios=[0.85], seed=random_seed)

# Определяем целевую переменную и предикторы
# 指定 H2O 自动建模中的“目标变量”和“输入特征”
y="Healthy_Status" # 你要预测的字段(即分类或回归目标)
x = list(h2o_df.columns) # 获取所有列名
x.remove(y) # 把目标变量 y 从特征列表中移除,剩下的就是输入变量(predictors)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
In [56]:
# 自动训练多种机器学习模型并自动选择最佳模型,然后在测试集上评估性能
# Создаем и обучаем модель H2O AutoML
automl = H2OAutoML(max_runtime_secs=60, seed=random_seed, verbosity="info")
# 最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);
#automl= H2OAutoML(max_models = 10, seed = 10, exclude_algos = ["StackedEnsemble", "DeepLearning"], verbosity="info", nfolds=0)


automl.train(x=x, y=y, training_frame=train)

# Оценка модели на тестовом наборе
performance = automl.leader.model_performance(test_data=test)
print(performance)
AutoML progress: |█
00:50:08.271: Project: AutoML_1_20250325_05008
00:50:08.272: 5-fold cross-validation will be used.
00:50:08.272: Setting stopping tolerance adaptively based on the training frame: 0.018464772811525407
00:50:08.272: Build control seed: 17
00:50:08.273: training frame: Frame key: AutoML_1_20250325_05008_training_py_2_sid_94de    cols: 10    rows: 2933  chunks: 1    size: 68508  checksum: 120712254201011546
00:50:08.273: validation frame: NULL
00:50:08.273: leaderboard frame: NULL
00:50:08.273: blending frame: NULL
00:50:08.273: response column: Healthy_Status
00:50:08.273: fold column: null
00:50:08.273: weights column: null
00:50:08.282: AutoML: XGBoost is not available; skipping it.
00:50:08.290: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}, {DeepLearning : [def_1 (3g, 10w), grid_1 (4g, 30w), grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {completion : [resume_best_grids (10g, 60w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w), best_of_family_2 (2g, 5w), best_of_family_3 (3g, 5w), best_of_family_4 (4g, 5w), best_of_family_5 (5g, 5w), all_2 (2g, 10w), all_3 (3g, 10w), all_4 (4g, 10w), all_5 (5g, 10w), monotonic (6g, 10w), best_of_family_gbm (6g, 10w), all_gbm (7g, 10w), best_of_family_xglm (8g, 10w), all_xglm (8g, 10w), best_of_family (10g, 10w), best_N (10g, 10w)]}]
00:50:08.307: Disabling Algo: XGBoost as requested by the user.
00:50:08.308: AutoML job created: 2025.03.25 00:50:08.251
00:50:08.308: AutoML build started: 2025.03.25 00:50:08.308
00:50:08.324: AutoML: starting GLM_1_AutoML_1_20250325_05008 model training
00:50:08.324: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:09.105: New leader: GLM_1_AutoML_1_20250325_05008, rmse: 0.2865832550349015
00:50:09.115: AutoML: starting GBM_1_AutoML_1_20250325_05008 model training
00:50:09.115: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:12.646: New leader: GBM_1_AutoML_1_20250325_05008, rmse: 0.12946059420121422

██
00:50:12.710: AutoML: starting StackedEnsemble_BestOfFamily_1_AutoML_1_20250325_05008 model training
00:50:12.710: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:12.906: New leader: StackedEnsemble_BestOfFamily_1_AutoML_1_20250325_05008, rmse: 0.12939553821337055
00:50:12.929: AutoML: starting DRF_1_AutoML_1_20250325_05008 model training
00:50:12.930: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:14.209: New leader: DRF_1_AutoML_1_20250325_05008, rmse: 0.1288446703068968
00:50:14.210: AutoML: starting GBM_2_AutoML_1_20250325_05008 model training
00:50:14.210: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:14.841: New leader: GBM_2_AutoML_1_20250325_05008, rmse: 0.1128652612873039
00:50:14.842: AutoML: starting GBM_3_AutoML_1_20250325_05008 model training
00:50:14.842: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:15.434: AutoML: starting GBM_4_AutoML_1_20250325_05008 model training
00:50:15.434: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:16.61: AutoML: starting StackedEnsemble_BestOfFamily_2_AutoML_1_20250325_05008 model training
00:50:16.61: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:16.203: AutoML: starting StackedEnsemble_AllModels_1_AutoML_1_20250325_05008 model training
00:50:16.204: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:16.378: New leader: StackedEnsemble_AllModels_1_AutoML_1_20250325_05008, rmse: 0.11231078419053396
00:50:16.378: AutoML: starting XRT_1_AutoML_1_20250325_05008 model training
00:50:16.378: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:17.284: AutoML: starting GBM_5_AutoML_1_20250325_05008 model training
00:50:17.285: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

███
00:50:18.165: New leader: GBM_5_AutoML_1_20250325_05008, rmse: 0.10833656809442252
00:50:18.171: AutoML: starting DeepLearning_1_AutoML_1_20250325_05008 model training
00:50:18.172: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:20.32: AutoML: starting StackedEnsemble_BestOfFamily_3_AutoML_1_20250325_05008 model training
00:50:20.33: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:20.199: AutoML: starting StackedEnsemble_AllModels_2_AutoML_1_20250325_05008 model training
00:50:20.199: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:20.355: New leader: StackedEnsemble_AllModels_2_AutoML_1_20250325_05008, rmse: 0.1082561546805813
00:50:20.359: AutoML: starting GBM_grid_1_AutoML_1_20250325_05008 hyperparameter search

███████████████████████████████
00:50:47.909: AutoML: starting DeepLearning_grid_1_AutoML_1_20250325_05008 hyperparameter search

████████████████
00:51:01.713: AutoML: starting StackedEnsemble_AllModels_3_AutoML_1_20250325_05008 model training
00:51:01.713: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:51:01.981: New leader: StackedEnsemble_AllModels_3_AutoML_1_20250325_05008, rmse: 0.10679729316052826
00:51:01.982: AutoML: starting DeepLearning_grid_2_AutoML_1_20250325_05008 hyperparameter search

█████
00:51:04.580: AutoML: starting DeepLearning_grid_3_AutoML_1_20250325_05008 hyperparameter search
00:51:07.172: AutoML: starting StackedEnsemble_BestOfFamily_4_AutoML_1_20250325_05008 model training
00:51:07.172: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

█████| (done) 100%

00:51:07.328: AutoML: starting StackedEnsemble_AllModels_4_AutoML_1_20250325_05008 model training
00:51:07.328: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:51:07.557: Retraining best GBM with learning rate annealing: GBM_5_AutoML_1_20250325_05008
00:51:07.557: AutoML: starting GBM_lr_annealing_selection_AutoML_1_20250325_05008_select_model model training
00:51:07.559: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:51:08.531: Actual modeling steps: [{GLM : [def_1 (1g, 10w)]}, {GBM : [def_5 (1g, 10w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w)]}, {DRF : [def_1 (2g, 10w)]}, {GBM : [def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w)]}, {StackedEnsemble : [best_of_family_2 (2g, 5w), all_2 (2g, 10w)]}, {DRF : [XRT (3g, 10w)]}, {GBM : [def_1 (3g, 10w)]}, {DeepLearning : [def_1 (3g, 10w)]}, {StackedEnsemble : [best_of_family_3 (3g, 5w), all_3 (3g, 10w)]}, {GBM : [grid_1 (4g, 60w)]}, {DeepLearning : [grid_1 (4g, 30w)]}, {StackedEnsemble : [all_4 (4g, 10w)]}, {DeepLearning : [grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {StackedEnsemble : [best_of_family_5 (5g, 5w), all_5 (5g, 10w)]}, {GBM : [lr_annealing (6g, 10w)]}]
00:51:08.531: AutoML build stopped: 2025.03.25 00:51:08.531
00:51:08.531: AutoML build done: built 43 models
00:51:08.531: AutoML duration:  1 min  0.223 sec

ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 0.005003135627156996
RMSE: 0.0707328468758115
MAE: 0.03130546386615607
RMSLE: 0.050631346579269945
Mean Residual Deviance: 0.005003135627156996
R^2: 0.9727803764776879
Null degrees of freedom: 514
Residual degrees of freedom: 506
Null deviance: 95.56748634029051
Residual deviance: 2.576614847985853
AIC: -1246.8038862031594
In [57]:
# 用来查看 H2O AutoML 所训练的所有模型的排行榜(leaderboard)
lb = automl.leaderboard
lb.head() # 查看排行榜中最靠前的几条记录(默认前 10 个模型)
Out[57]:
model_id rmse mse mae rmsle mean_residual_deviance
StackedEnsemble_AllModels_3_AutoML_1_20250325_05008 0.1067970.01140570.03906130.0757921 0.0114057
StackedEnsemble_AllModels_4_AutoML_1_20250325_05008 0.1075620.01156960.039639 0.076378 0.0115696
StackedEnsemble_AllModels_2_AutoML_1_20250325_05008 0.1082560.01171940.03656980.0768915 0.0117194
GBM_5_AutoML_1_20250325_05008 0.1083370.01173680.03806860.0774662 0.0117368
StackedEnsemble_BestOfFamily_4_AutoML_1_20250325_050080.1084820.01176840.03841240.0774874 0.0117684
StackedEnsemble_BestOfFamily_3_AutoML_1_20250325_050080.1085060.01177350.03815670.0775736 0.0117735
GBM_grid_1_AutoML_1_20250325_05008_model_10 0.1111030.01234390.03792810.0789915 0.0123439
GBM_grid_1_AutoML_1_20250325_05008_model_16 0.1120890.01256390.03242810.0792078 0.0125639
StackedEnsemble_AllModels_1_AutoML_1_20250325_05008 0.1123110.01261370.03897860.0799503 0.0126137
GBM_grid_1_AutoML_1_20250325_05008_model_4 0.1127330.01270870.04174760.080158 0.0127087
[10 rows x 6 columns]
In [58]:
#数据格式转换、数据集划分、指定目标变量和特征列

# Переводим датафрейм в формат H2OFrame 
# 把 Pandas DataFrame(比如 autoML_df)转换为 H2O 使用的专有数据格式 H2OFrame,这是 H2O 建模的输入格式
h2o_df = H2OFrame(autoML_df)

h2o_df['Healthy_Status'] = h2o_df['Healthy_Status'].asfactor()#这样 H2O 就会知道这是一个二分类问题,而不是连续值回归。

# Разбиение на обучающую и тестовую выборки
# 把 H2O 数据集 h2o_df 按照 85% : 15% 的比例,随机划分为训练集(train)和测试集(test)
random_seed = 17
train, test = h2o_df.split_frame(ratios=[0.85], seed=random_seed)

# Определяем целевую переменную и предикторы
# 指定 H2O 自动建模中的“目标变量”和“输入特征”
y="Healthy_Status" # 你要预测的字段(即分类或回归目标)
x = list(h2o_df.columns) # 获取所有列名
x.remove(y) # 把目标变量 y 从特征列表中移除,剩下的就是输入变量(predictors)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
In [59]:
# 自动训练多种机器学习模型并自动选择最佳模型,然后在测试集上评估性能
# Создаем и обучаем модель H2O AutoML
automl = H2OAutoML(max_runtime_secs=60, seed=random_seed, verbosity="info")
# 最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);
#automl= H2OAutoML(max_models = 10, seed = 10, exclude_algos = ["StackedEnsemble", "DeepLearning"], verbosity="info", nfolds=0)


automl.train(x=x, y=y, training_frame=train)

# Оценка модели на тестовом наборе
performance = automl.leader.model_performance(test_data=test)
print(performance)
AutoML progress: |█
00:52:58.706: Project: AutoML_2_20250325_05258
00:52:58.707: 5-fold cross-validation will be used.
00:52:58.707: Setting stopping tolerance adaptively based on the training frame: 0.018464772811525407
00:52:58.707: Build control seed: 17
00:52:58.707: training frame: Frame key: AutoML_2_20250325_05258_training_py_10_sid_94de    cols: 10    rows: 2933  chunks: 1    size: 69056  checksum: 120712254201011546
00:52:58.707: validation frame: NULL
00:52:58.707: leaderboard frame: NULL
00:52:58.707: blending frame: NULL
00:52:58.707: response column: Healthy_Status
00:52:58.707: fold column: null
00:52:58.707: weights column: null
00:52:58.707: AutoML: XGBoost is not available; skipping it.
00:52:58.707: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}, {DeepLearning : [def_1 (3g, 10w), grid_1 (4g, 30w), grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {completion : [resume_best_grids (10g, 60w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w), best_of_family_2 (2g, 5w), best_of_family_3 (3g, 5w), best_of_family_4 (4g, 5w), best_of_family_5 (5g, 5w), all_2 (2g, 10w), all_3 (3g, 10w), all_4 (4g, 10w), all_5 (5g, 10w), monotonic (6g, 10w), best_of_family_gbm (6g, 10w), all_gbm (7g, 10w), best_of_family_xglm (8g, 10w), all_xglm (8g, 10w), best_of_family (10g, 10w), best_N (10g, 10w)]}]
00:52:58.708: Disabling Algo: XGBoost as requested by the user.
00:52:58.708: AutoML job created: 2025.03.25 00:52:58.706
00:52:58.709: AutoML build started: 2025.03.25 00:52:58.709
00:52:58.710: AutoML: starting GLM_1_AutoML_2_20250325_05258 model training
00:52:59.230: New leader: GLM_1_AutoML_2_20250325_05258, auc: 0.9848418510928573
00:52:59.233: AutoML: starting GBM_1_AutoML_2_20250325_05258 model training
00:53:00.955: New leader: GBM_1_AutoML_2_20250325_05258, auc: 0.9964776833368534
00:53:00.956: AutoML: starting StackedEnsemble_BestOfFamily_1_AutoML_2_20250325_05258 model training
00:53:01.126: AutoML: starting DRF_1_AutoML_2_20250325_05258 model training
00:53:01.885: AutoML: starting GBM_2_AutoML_2_20250325_05258 model training
00:53:02.768: New leader: GBM_2_AutoML_2_20250325_05258, auc: 0.9970530816346334
00:53:02.768: AutoML: starting GBM_3_AutoML_2_20250325_05258 model training

██
00:53:03.735: AutoML: starting GBM_4_AutoML_2_20250325_05258 model training
00:53:04.748: AutoML: starting StackedEnsemble_BestOfFamily_2_AutoML_2_20250325_05258 model training
00:53:05.11: AutoML: starting StackedEnsemble_AllModels_1_AutoML_2_20250325_05258 model training
00:53:05.166: AutoML: starting XRT_1_AutoML_2_20250325_05258 model training
00:53:05.679: AutoML: starting GBM_5_AutoML_2_20250325_05258 model training
00:53:06.581: New leader: GBM_5_AutoML_2_20250325_05258, auc: 0.9973356432987219
00:53:06.581: AutoML: starting DeepLearning_1_AutoML_2_20250325_05258 model training
00:53:07.3: AutoML: starting StackedEnsemble_BestOfFamily_3_AutoML_2_20250325_05258 model training
00:53:07.279: AutoML: starting StackedEnsemble_AllModels_2_AutoML_2_20250325_05258 model training
00:53:07.558: AutoML: starting GBM_grid_1_AutoML_2_20250325_05258 hyperparameter search

███████████████████████████
00:53:27.858: New leader: GBM_grid_1_AutoML_2_20250325_05258_model_10, auc: 0.9973992910270975

████████
00:53:36.928: AutoML: starting DeepLearning_grid_1_AutoML_2_20250325_05258 hyperparameter search

██████████████
00:53:51.460: AutoML: starting StackedEnsemble_BestOfFamily_4_AutoML_2_20250325_05258 model training
00:53:51.743: AutoML: starting StackedEnsemble_AllModels_3_AutoML_2_20250325_05258 model training

█████
00:53:52.72: AutoML: starting DeepLearning_grid_2_AutoML_2_20250325_05258 hyperparameter search
00:53:54.829: AutoML: starting DeepLearning_grid_3_AutoML_2_20250325_05258 hyperparameter search

█████
00:53:57.690: AutoML: starting StackedEnsemble_AllModels_4_AutoML_2_20250325_05258 model training
00:53:58.74: Retraining best GBM with learning rate annealing: GBM_grid_1_AutoML_2_20250325_05258_model_10
00:53:58.74: AutoML: starting GBM_lr_annealing_selection_AutoML_2_20250325_05258_select_model model training
00:53:59.983: Actual modeling steps: [{GLM : [def_1 (1g, 10w)]}, {GBM : [def_5 (1g, 10w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w)]}, {DRF : [def_1 (2g, 10w)]}, {GBM : [def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w)]}, {StackedEnsemble : [best_of_family_2 (2g, 5w), all_2 (2g, 10w)]}, {DRF : [XRT (3g, 10w)]}, {GBM : [def_1 (3g, 10w)]}, {DeepLearning : [def_1 (3g, 10w)]}, {StackedEnsemble : [best_of_family_3 (3g, 5w), all_3 (3g, 10w)]}, {GBM : [grid_1 (4g, 60w)]}, {DeepLearning : [grid_1 (4g, 30w)]}, {StackedEnsemble : [best_of_family_4 (4g, 5w), all_4 (4g, 10w)]}, {DeepLearning : [grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {StackedEnsemble : [all_5 (5g, 10w)]}, {GBM : [lr_annealing (6g, 10w)]}]
00:53:59.983: AutoML build stopped: 2025.03.25 00:53:59.983
00:53:59.983: AutoML build done: built 35 models
00:53:59.983: AutoML duration:  1 min  1.274 sec

█| (done) 100%
ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.0056893233587090985
RMSE: 0.07542760342678997
LogLoss: 0.01841294346956109
Mean Per-Class Error: 0.0038461538461538464
AUC: 0.9998153846153847
AUCPR: 0.9994308033611844
Gini: 0.9996307692307693

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1346544244507371
       0    1    Error    Rate
-----  ---  ---  -------  -----------
0      387  3    0.0077   (3.0/390.0)
1      0    125  0        (0.0/125.0)
Total  387  128  0.0058   (3.0/515.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.134654     0.988142  115
max f2                       0.134654     0.995223  115
max f0point5                 0.666699     0.995106  109
max accuracy                 0.666699     0.994175  109
max precision                0.999298     1         0
max recall                   0.134654     1         115
max specificity              0.999298     1         0
max absolute_mcc             0.134654     0.984404  115
max min_per_class_accuracy   0.134654     0.992308  115
max mean_per_class_accuracy  0.134654     0.996154  115
max tns                      0.999298     390       0
max fns                      0.999298     123       0
max fps                      0.000157684  390       399
max tps                      0.134654     125       115
max tnr                      0.999298     1         0
max fnr                      0.999298     0.984     0
max fpr                      0.000157684  1         399
max tpr                      0.134654     1         115

Gains/Lift Table: Avg response rate: 24.27 %, avg score: 23.96 %
group    cumulative_data_fraction    lower_threshold    lift     cumulative_lift    response_rate    score        cumulative_response_rate    cumulative_score    capture_rate    cumulative_capture_rate    gain     cumulative_gain    kolmogorov_smirnov
-------  --------------------------  -----------------  -------  -----------------  ---------------  -----------  --------------------------  ------------------  --------------  -------------------------  -------  -----------------  --------------------
1        0.0116505                   0.999053           4.12     4.12               1                0.999143     1                           0.999143            0.048           0.048                      312      312                0.048
2        0.0213592                   0.999              4.12     4.12               1                0.999011     1                           0.999083            0.04            0.088                      312      312                0.088
3        0.031068                    0.998884           4.12     4.12               1                0.99894      1                           0.999038            0.04            0.128                      312      312                0.128
4        0.0407767                   0.998811           4.12     4.12               1                0.998846     1                           0.998993            0.04            0.168                      312      312                0.168
5        0.0504854                   0.998726           4.12     4.12               1                0.998765     1                           0.998949            0.04            0.208                      312      312                0.208
6        0.100971                    0.998496           4.12     4.12               1                0.998611     1                           0.99878             0.208           0.416                      312      312                0.416
7        0.151456                    0.998089           4.12     4.12               1                0.998334     1                           0.998631            0.208           0.624                      312      312                0.624
8        0.2                         0.995943           4.12     4.12               1                0.997439     1                           0.998342            0.2             0.824                      312      312                0.824
9        0.300971                    0.00554024         1.74308  3.32258            0.423077         0.386794     0.806452                    0.793177            0.176           1                          74.3077  232.258            0.923077
10       0.4                         0.00237039         0        2.5                0                0.0038733    0.606796                    0.597767            0               1                          -100     150                0.792308
11       0.500971                    0.00132393         0        1.99612            0                0.00177338   0.484496                    0.477644            0               1                          -100     99.6124            0.658974
12       0.6                         0.000788305        0        1.66667            0                0.00102156   0.404531                    0.398979            0               1                          -100     66.6667            0.528205
13       0.699029                    0.000568038        0        1.43056            0                0.000656848  0.347222                    0.34255             0               1                          -100     43.0556            0.397436
14       0.8                         0.000400525        0        1.25               0                0.000481929  0.303398                    0.299376            0               1                          -100     25                 0.264103
15       0.899029                    0.000267807        0        1.11231            0                0.000330957  0.269978                    0.266436            0               1                          -100     11.2311            0.133333
16       1                           0.000157363        0        1                  0                0.000217108  0.242718                    0.239556            0               1                          -100     0                  0
In [60]:
# 用来查看 H2O AutoML 所训练的所有模型的排行榜(leaderboard)
lb = automl.leaderboard
lb.head() # 查看排行榜中最靠前的几条记录(默认前 10 个模型)
Out[60]:
model_id auc logloss aucpr mean_per_class_error rmse mse
GBM_grid_1_AutoML_2_20250325_05258_model_10 0.9973990.04565750.995637 0.02214370.109083 0.011899
GBM_5_AutoML_2_20250325_05258 0.9973360.04389690.995712 0.02251020.107085 0.0114673
GBM_grid_1_AutoML_2_20250325_05258_model_5 0.9973030.04210130.995893 0.02287070.104793 0.0109815
StackedEnsemble_AllModels_3_AutoML_2_20250325_052580.9972890.03951680.996057 0.01819040.100036 0.0100072
StackedEnsemble_AllModels_4_AutoML_2_20250325_052580.9972160.03949780.995998 0.01770780.09992240.00998449
GBM_grid_1_AutoML_2_20250325_05258_model_4 0.9971740.0419 0.995587 0.02059160.10349 0.0107102
GBM_grid_1_AutoML_2_20250325_05258_model_14 0.9970930.04505410.995276 0.020109 0.107151 0.0114814
GBM_grid_1_AutoML_2_20250325_05258_model_6 0.9970710.04758950.995481 0.02370780.10739 0.0115326
GBM_2_AutoML_2_20250325_05258 0.9970530.04660260.995157 0.023109 0.109112 0.0119054
GBM_4_AutoML_2_20250325_05258 0.9969910.04653590.995036 0.02130660.10774 0.0116078
[10 rows x 7 columns]
In [61]:
# AutoML 训练完成后生成的模型排行榜
model_ids = list(automl.leaderboard['model_id'].as_data_frame().iloc[:,0])
model_ids
D:\anaconda\Lib\site-packages\h2o\frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)

  warnings.warn("Converting H2O frame to pandas dataframe using single-thread.  For faster conversion using"
Out[61]:
['GBM_grid_1_AutoML_2_20250325_05258_model_10',
 'GBM_5_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_5',
 'StackedEnsemble_AllModels_3_AutoML_2_20250325_05258',
 'StackedEnsemble_AllModels_4_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_4',
 'GBM_grid_1_AutoML_2_20250325_05258_model_14',
 'GBM_grid_1_AutoML_2_20250325_05258_model_6',
 'GBM_2_AutoML_2_20250325_05258',
 'GBM_4_AutoML_2_20250325_05258',
 'StackedEnsemble_BestOfFamily_4_AutoML_2_20250325_05258',
 'StackedEnsemble_BestOfFamily_3_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_9',
 'GBM_3_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_3',
 'StackedEnsemble_AllModels_2_AutoML_2_20250325_05258',
 'StackedEnsemble_AllModels_1_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_15',
 'GBM_grid_1_AutoML_2_20250325_05258_model_2',
 'GBM_1_AutoML_2_20250325_05258',
 'StackedEnsemble_BestOfFamily_2_AutoML_2_20250325_05258',
 'StackedEnsemble_BestOfFamily_1_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_13',
 'GBM_grid_1_AutoML_2_20250325_05258_model_8',
 'GBM_grid_1_AutoML_2_20250325_05258_model_11',
 'GBM_grid_1_AutoML_2_20250325_05258_model_12',
 'GBM_grid_1_AutoML_2_20250325_05258_model_1',
 'XRT_1_AutoML_2_20250325_05258',
 'DRF_1_AutoML_2_20250325_05258',
 'GBM_grid_1_AutoML_2_20250325_05258_model_16',
 'GBM_grid_1_AutoML_2_20250325_05258_model_7',
 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_1',
 'DeepLearning_grid_2_AutoML_2_20250325_05258_model_1',
 'DeepLearning_1_AutoML_2_20250325_05258',
 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_2',
 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_3',
 'DeepLearning_grid_3_AutoML_2_20250325_05258_model_1',
 'DeepLearning_grid_2_AutoML_2_20250325_05258_model_2',
 'DeepLearning_grid_3_AutoML_2_20250325_05258_model_2',
 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_4',
 'DeepLearning_grid_3_AutoML_2_20250325_05258_model_3',
 'GLM_1_AutoML_2_20250325_05258',
 'DeepLearning_grid_2_AutoML_2_20250325_05258_model_3']
In [66]:
# 从 AutoML 模型排行榜中筛选出一个 XGBoost 模型,并将其 H2O 格式的参数转换为原生 XGBoost 可识别的格式
out = h2o.get_model([mid for mid in model_ids if "XGBoost" in mid][0])
out.convert_H2OXGBoostParams_2_XGBoostParams()
# H2O 现在的后端确实支持 XGBoost(所以你能创建 H2OXGBoostEstimator()),
# 但在这一次 AutoML 过程中,并没有实际训练出 XGBoost 模型,从而排行榜里根本没有以 “XGBoost_...” 命名的模型。
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[66], line 2
      1 # 从 AutoML 模型排行榜中筛选出一个 XGBoost 模型,并将其 H2O 格式的参数转换为原生 XGBoost 可识别的格式
----> 2 out = h2o.get_model([mid for mid in model_ids if "XGBoost" in mid][0])
      3 out.convert_H2OXGBoostParams_2_XGBoostParams()

IndexError: list index out of range
In [63]:
from h2o.estimators import H2OXGBoostEstimator
try:
    m = H2OXGBoostEstimator()
    print("✅ XGBoost 可用")
except Exception as e:
    print("❌ XGBoost 不可用:", e)
✅ XGBoost 可用
In [65]:
# 直接训练 XGBoost 模型
import h2o
from h2o.estimators import H2OXGBoostEstimator

# 如果还没连接到手动启动的 H2O
h2o.init(ip="localhost", port=54321)  # start_h2o=False 如果已经有集群在运行

# 假设你已经有训练集 train_h2o (类型是 H2OFrame),x为特征列表,y为目标列
# 例:
# x = ["col1", "col2", ...]
# y = "target"

xgb = H2OXGBoostEstimator(
    ntrees=100,
    max_depth=5,
    learn_rate=0.1,
    seed=42
)
xgb.train(x=x, y=y, training_frame=train_h2o)

# 评估模型
perf = xgb.model_performance(valid=True)  # 如果有验证集的话
print(perf)
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Warning: Your H2O cluster version is (4 months and 22 days) old.  There may be a newer version available.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
H2O_cluster_uptime: 11 mins 42 secs
H2O_cluster_timezone: Europe/Moscow
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.46.0.6
H2O_cluster_version_age: 4 months and 22 days
H2O_cluster_name: H2O_from_python_25150_atuyo6
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 7.745 Gb
H2O_cluster_total_cores: 32
H2O_cluster_allowed_cores: 32
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.11.5 final
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[65], line 19
      8 # 假设你已经有训练集 train_h2o (类型是 H2OFrame),x为特征列表,y为目标列
      9 # 例:
     10 # x = ["col1", "col2", ...]
     11 # y = "target"
     13 xgb = H2OXGBoostEstimator(
     14     ntrees=100,
     15     max_depth=5,
     16     learn_rate=0.1,
     17     seed=42
     18 )
---> 19 xgb.train(x=x, y=y, training_frame=train_h2o)
     21 # 评估模型
     22 perf = xgb.model_performance(valid=True)  # 如果有验证集的话

NameError: name 'train_h2o' is not defined
In [64]:
# 在 AutoML 中启用 XGBoost
from h2o.automl import H2OAutoML

automl = H2OAutoML(
    max_runtime_secs=60,
    seed=42,
    include_algos=["XGBoost", "GBM", "GLM", "DRF", "DeepLearning", "StackedEnsemble"],
    verbosity="info"
)
automl.train(x=x, y=y, training_frame=train_h2o)

lb = automl.leaderboard
lb.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[64], line 10
      2 from h2o.automl import H2OAutoML
      4 automl = H2OAutoML(
      5     max_runtime_secs=60,
      6     seed=42,
      7     include_algos=["XGBoost", "GBM", "GLM", "DRF", "DeepLearning", "StackedEnsemble"],
      8     verbosity="info"
      9 )
---> 10 automl.train(x=x, y=y, training_frame=train_h2o)
     12 lb = automl.leaderboard
     13 lb.head()

NameError: name 'train_h2o' is not defined
In [60]:
xgb_ids = [mid for mid in model_ids if "XGBoost" in mid]
print(xgb_ids)  # 如果是空的,说明没有 XGBoost 模型
[]
In [61]:
if xgb_ids:
    out = h2o.get_model(xgb_ids[0])
    params = out.convert_H2OXGBoostParams_2_XGBoostParams()
    print(params)
else:
    print("⚠️ 当前 AutoML 没有训练出 XGBoost 模型。")
⚠️ 当前 AutoML 没有训练出 XGBoost 模型。
In [62]:
import h2o
from h2o.estimators import H2OXGBoostEstimator

h2o.init()

# 检查是否可以创建 H2O 的 XGBoost 模型
try:
    model = H2OXGBoostEstimator()
    print("✅ 当前环境支持 H2O 的 XGBoost 模型。")
except Exception as e:
    print("❌ 当前环境不支持 H2O 的 XGBoost,原因:", e)
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Warning: Your H2O cluster version is (4 months and 21 days) old.  There may be a newer version available.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
H2O_cluster_uptime: 11 hours 21 mins
H2O_cluster_timezone: Europe/Moscow
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.46.0.6
H2O_cluster_version_age: 4 months and 21 days
H2O_cluster_name: H2O_from_python_25150_lkcgsm
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 7.331 Gb
H2O_cluster_total_cores: 32
H2O_cluster_allowed_cores: 32
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.11.5 final
✅ 当前环境支持 H2O 的 XGBoost 模型。
In [81]:
automl = H2OAutoML(
    max_runtime_secs=120,
    seed=42,
    include_algos=["XGBoost", "GBM", "GLM", "DRF", "DeepLearning", "StackedEnsemble"]
)
from h2o.estimators import H2OXGBoostEstimator

xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
xgb.train(x=x, y=y, training_frame=train)
xgb.varimp_plot()
xgb.convert_H2OXGBoostParams_2_XGBoostParams()
---------------------------------------------------------------------------
H2OResponseError                          Traceback (most recent call last)
Cell In[81], line 9
      6 from h2o.estimators import H2OXGBoostEstimator
      8 xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
----> 9 xgb.train(x=x, y=y, training_frame=train)
     10 xgb.varimp_plot()
     11 xgb.convert_H2OXGBoostParams_2_XGBoostParams()

File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:107, in H2OEstimator.train(self, x, y, training_frame, offset_column, fold_column, weights_column, validation_frame, max_runtime_secs, ignored_columns, model_id, verbose)
     88 """
     89 Train the H2O model.
     90 
   (...)
    101 :param bool verbose: Print scoring history to stdout. Defaults to False.
    102 """
    103 parms = self._make_parms(x=x, y=y, training_frame=training_frame, offset_column=offset_column, 
    104                          fold_column=fold_column, weights_column=weights_column, 
    105                          validation_frame=validation_frame, max_runtime_secs=max_runtime_secs, 
    106                          ignored_columns=ignored_columns, model_id=model_id, verbose=verbose)
--> 107 self._train(parms, verbose=verbose)
    108 return self

File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:186, in H2OEstimator._train(self, parms, verbose)
    183 assert_is_type(verbose, bool)
    185 rest_ver = self._get_rest_version(parms)
--> 186 model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    187 job = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))
    189 if model_builder_json["messages"] is not None:

File D:\anaconda\Lib\site-packages\h2o\h2o.py:123, in api(endpoint, data, json, filename, save_to)
    121 # type checks are performed in H2OConnection class
    122 _check_connection()
--> 123 return h2oconn.request(endpoint, data=data, json=json, filename=filename, save_to=save_to)

File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:499, in H2OConnection.request(self, endpoint, data, json, filename, save_to)
    497         save_to = save_to(resp)
    498     self._log_end_transaction(start_time, resp)
--> 499     return self._process_response(resp, save_to)
    501 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
    502     if self._local_server and not self._local_server.is_running():

File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:853, in H2OConnection._process_response(response, save_to)
    851 if status_code in {400, 404, 412} and isinstance(data, H2OErrorV3):
    852     data.show_stacktrace = False
--> 853     raise H2OResponseError(data)
    855 # Server errors (notably 500 = "Server Error")
    856 # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server
    857 # did not provide the correct status code.
    858 raise H2OServerError("HTTP %d %s:\n%s" % (status_code, response.reason, data))

H2OResponseError: Server error water.exceptions.H2ONotFoundArgumentException:
  Error: POST /3/ModelBuilders/xgboost not found
  Request: POST /3/ModelBuilders/xgboost
    data: {'training_frame': 'py_37_sid_add6', 'nfolds': '0', 'keep_cross_validation_models': 'True', 'keep_cross_validation_predictions': 'False', 'keep_cross_validation_fold_assignment': 'False', 'score_each_iteration': 'False', 'fold_assignment': 'auto', 'response_column': 'Healthy_Status', 'ignore_const_cols': 'True', 'stopping_rounds': '0', 'stopping_metric': 'auto', 'stopping_tolerance': '0.001', 'max_runtime_secs': '0.0', 'seed': '42', 'distribution': 'auto', 'tweedie_power': '1.5', 'categorical_encoding': 'auto', 'quiet_mode': 'True', 'ntrees': '100', 'max_depth': '5', 'min_rows': '1.0', 'min_child_weight': '1.0', 'learn_rate': '0.1', 'eta': '0.3', 'sample_rate': '1.0', 'subsample': '1.0', 'col_sample_rate': '1.0', 'colsample_bylevel': '1.0', 'col_sample_rate_per_tree': '1.0', 'colsample_bytree': '1.0', 'colsample_bynode': '1.0', 'max_abs_leafnode_pred': '0.0', 'max_delta_step': '0.0', 'score_tree_interval': '0', 'min_split_improvement': '0.0', 'gamma': '0.0', 'nthread': '-1', 'build_tree_one_node': 'False', 'parallelize_cross_validation': 'True', 'calibrate_model': 'False', 'calibration_method': 'auto', 'max_bins': '256', 'max_leaves': '0', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': '0.0', 'one_drop': 'False', 'skip_drop': '0.0', 'tree_method': 'auto', 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_lambda': '1.0', 'reg_alpha': '0.0', 'dmatrix_type': 'auto', 'backend': 'auto', 'gainslift_bins': '-1', 'auc_type': 'auto', 'scale_pos_weight': '1.0', 'score_eval_metric_only': 'False'}
In [63]:
# 自动检测并启用 XGBoost(如果可用),否则跳过。
from h2o.automl import H2OAutoML

automl = H2OAutoML(
    max_runtime_secs=60,
    seed=42,
    verbosity="info"
)
automl.train(x=x, y=y, training_frame=train)
AutoML progress: |█
01:27:50.94: Project: AutoML_10_20250324_12750
01:27:50.94: 5-fold cross-validation will be used.
01:27:50.94: Setting stopping tolerance adaptively based on the training frame: 0.018464772811525407
01:27:50.94: Build control seed: 42
01:27:50.94: training frame: Frame key: AutoML_10_20250324_12750_training_py_10_sid_baac    cols: 10    rows: 2933  chunks: 1    size: 69060  checksum: -6527674322480268147
01:27:50.94: validation frame: NULL
01:27:50.94: leaderboard frame: NULL
01:27:50.94: blending frame: NULL
01:27:50.94: response column: Healthy_Status
01:27:50.94: fold column: null
01:27:50.94: weights column: null
01:27:50.94: AutoML: XGBoost is not available; skipping it.
01:27:50.94: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}, {DeepLearning : [def_1 (3g, 10w), grid_1 (4g, 30w), grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {completion : [resume_best_grids (10g, 60w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w), best_of_family_2 (2g, 5w), best_of_family_3 (3g, 5w), best_of_family_4 (4g, 5w), best_of_family_5 (5g, 5w), all_2 (2g, 10w), all_3 (3g, 10w), all_4 (4g, 10w), all_5 (5g, 10w), monotonic (6g, 10w), best_of_family_gbm (6g, 10w), all_gbm (7g, 10w), best_of_family_xglm (8g, 10w), all_xglm (8g, 10w), best_of_family (10g, 10w), best_N (10g, 10w)]}]
01:27:50.95: Disabling Algo: XGBoost as requested by the user.
01:27:50.95: AutoML job created: 2025.03.24 01:27:50.94
01:27:50.102: AutoML build started: 2025.03.24 01:27:50.98
01:27:50.104: AutoML: starting GLM_1_AutoML_10_20250324_12750 model training
01:27:50.238: New leader: GLM_1_AutoML_10_20250324_12750, auc: 0.9868937054394548
01:27:50.238: AutoML: starting GBM_1_AutoML_10_20250324_12750 model training
01:27:51.228: New leader: GBM_1_AutoML_10_20250324_12750, auc: 0.996292163052351
01:27:51.229: AutoML: starting StackedEnsemble_BestOfFamily_1_AutoML_10_20250324_12750 model training
01:27:51.430: AutoML: starting DRF_1_AutoML_10_20250324_12750 model training
01:27:51.845: AutoML: starting GBM_2_AutoML_10_20250324_12750 model training
01:27:52.701: New leader: GBM_2_AutoML_10_20250324_12750, auc: 0.9971375647180377
01:27:52.701: AutoML: starting GBM_3_AutoML_10_20250324_12750 model training
01:27:53.713: New leader: GBM_3_AutoML_10_20250324_12750, auc: 0.9974458138061342
01:27:53.714: AutoML: starting GBM_4_AutoML_10_20250324_12750 model training

██
01:27:54.775: AutoML: starting StackedEnsemble_BestOfFamily_2_AutoML_10_20250324_12750 model training
01:27:54.936: New leader: StackedEnsemble_BestOfFamily_2_AutoML_10_20250324_12750, auc: 0.9975342927110507
01:27:54.937: AutoML: starting StackedEnsemble_AllModels_1_AutoML_10_20250324_12750 model training
01:27:55.96: AutoML: starting XRT_1_AutoML_10_20250324_12750 model training
01:27:55.562: AutoML: starting GBM_5_AutoML_10_20250324_12750 model training
01:27:56.274: AutoML: starting DeepLearning_1_AutoML_10_20250324_12750 model training
01:27:56.546: AutoML: starting StackedEnsemble_BestOfFamily_3_AutoML_10_20250324_12750 model training
01:27:56.704: AutoML: starting StackedEnsemble_AllModels_2_AutoML_10_20250324_12750 model training
01:27:56.865: AutoML: starting GBM_grid_1_AutoML_10_20250324_12750 hyperparameter search

████████████████████████
01:28:12.91: New leader: GBM_grid_1_AutoML_10_20250324_12750_model_9, auc: 0.9976005091818271

███████████
01:28:27.329: AutoML: starting DeepLearning_grid_1_AutoML_10_20250324_12750 hyperparameter search

████████████████
01:28:42.748: AutoML: starting StackedEnsemble_BestOfFamily_4_AutoML_10_20250324_12750 model training
01:28:43.61: AutoML: starting StackedEnsemble_AllModels_3_AutoML_10_20250324_12750 model training
01:28:43.408: New leader: StackedEnsemble_AllModels_3_AutoML_10_20250324_12750, auc: 0.9979652706027412
01:28:43.410: AutoML: starting DeepLearning_grid_2_AutoML_10_20250324_12750 hyperparameter search

█████
01:28:47.390: AutoML: starting DeepLearning_grid_3_AutoML_10_20250324_12750 hyperparameter search

████| (done) 100%

01:28:51.15: Actual modeling steps: [{GLM : [def_1 (1g, 10w)]}, {GBM : [def_5 (1g, 10w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w)]}, {DRF : [def_1 (2g, 10w)]}, {GBM : [def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w)]}, {StackedEnsemble : [best_of_family_2 (2g, 5w), all_2 (2g, 10w)]}, {DRF : [XRT (3g, 10w)]}, {GBM : [def_1 (3g, 10w)]}, {DeepLearning : [def_1 (3g, 10w)]}, {StackedEnsemble : [best_of_family_3 (3g, 5w), all_3 (3g, 10w)]}, {GBM : [grid_1 (4g, 60w)]}, {DeepLearning : [grid_1 (4g, 30w)]}, {StackedEnsemble : [best_of_family_4 (4g, 5w), all_4 (4g, 10w)]}, {DeepLearning : [grid_2 (5g, 30w), grid_3 (5g, 30w)]}]
01:28:51.15: AutoML build stopped: 2025.03.24 01:28:51.15
01:28:51.15: AutoML build done: built 34 models
01:28:51.15: AutoML duration:  1 min  0.917 sec

Out[63]:
Model Details
=============
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_3_AutoML_10_20250324_12750
Model Summary for Stacked Ensemble:
key value
Stacking strategy cross_validation
Number of base models (used / total) 6/31
# GBM base models (used / total) 4/23
# DRF base models (used / total) 2/2
# DeepLearning base models (used / total) 0/5
# GLM base models (used / total) 0/1
Metalearner algorithm GLM
Metalearner fold assignment scheme Random
Metalearner nfolds 5
Metalearner fold_column None
Custom metalearner hyperparameters None
ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 9.240592103219748e-05
RMSE: 0.009612799853955011
LogLoss: 0.0029767531608037834
AUC: 1.0
AUCPR: 1.0
Gini: 1.0
Null degrees of freedom: 2932
Residual degrees of freedom: 2926
Null deviance: 3503.936615660394
Residual deviance: 17.46163404127499
AIC: 31.46163404127499
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.932231441206889
0 1 Error Rate
0 2098.0 0.0 0.0 (0.0/2098.0)
1 0.0 835.0 0.0 (0.0/835.0)
Total 2098.0 835.0 0.0 (0.0/2933.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
max f1 0.9322314 1.0 122.0
max f2 0.9322314 1.0 122.0
max f0point5 0.9322314 1.0 122.0
max accuracy 0.9322314 1.0 122.0
max precision 0.9998620 1.0 0.0
max recall 0.9322314 1.0 122.0
max specificity 0.9998620 1.0 0.0
max absolute_mcc 0.9322314 1.0 122.0
max min_per_class_accuracy 0.9322314 1.0 122.0
max mean_per_class_accuracy 0.9322314 1.0 122.0
max tns 0.9998620 2098.0 0.0
max fns 0.9998620 822.0 0.0
max fps 0.0000647 2098.0 399.0
max tps 0.9322314 835.0 122.0
max tnr 0.9998620 1.0 0.0
max fnr 0.9998620 0.9844311 0.0
max fpr 0.0000647 1.0 399.0
max tpr 0.9322314 1.0 122.0
Gains/Lift Table: Avg response rate: 28.47 %, avg score: 28.63 %
group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
1 0.0102284 0.9998396 3.5125749 3.5125749 1.0 0.9998528 1.0 0.9998528 0.0359281 0.0359281 251.2574850 251.2574850 0.0359281
2 0.0201159 0.9998267 3.5125749 3.5125749 1.0 0.9998318 1.0 0.9998425 0.0347305 0.0706587 251.2574850 251.2574850 0.0706587
3 0.0300034 0.9998215 3.5125749 3.5125749 1.0 0.9998242 1.0 0.9998365 0.0347305 0.1053892 251.2574850 251.2574850 0.1053892
4 0.0402318 0.9998150 3.5125749 3.5125749 1.0 0.9998181 1.0 0.9998318 0.0359281 0.1413174 251.2574850 251.2574850 0.1413174
5 0.0501193 0.9998089 3.5125749 3.5125749 1.0 0.9998116 1.0 0.9998278 0.0347305 0.1760479 251.2574850 251.2574850 0.1760479
6 0.1002387 0.9997828 3.5125749 3.5125749 1.0 0.9997954 1.0 0.9998116 0.1760479 0.3520958 251.2574850 251.2574850 0.3520958
7 0.1500170 0.9997479 3.5125749 3.5125749 1.0 0.9997668 1.0 0.9997968 0.1748503 0.5269461 251.2574850 251.2574850 0.5269461
8 0.2001364 0.9994625 3.5125749 3.5125749 1.0 0.9996776 1.0 0.9997669 0.1760479 0.7029940 251.2574850 251.2574850 0.7029940
9 0.3000341 0.0338779 2.9731009 3.3329545 0.8464164 0.8489891 0.9488636 0.9495647 0.2970060 1.0 197.3100897 233.2954545 0.9785510
10 0.3999318 0.0028213 0.0 2.5004263 0.0 0.0102039 0.7118500 0.7149247 0.0 1.0 -100.0 150.0426257 0.8388942
11 0.5001705 0.0011241 0.0 1.9993183 0.0 0.0016506 0.5691888 0.5719782 0.0 1.0 -100.0 99.9318337 0.6987607
12 0.6000682 0.0006141 0.0 1.6664773 0.0 0.0008402 0.4744318 0.4768967 0.0 1.0 -100.0 66.6477273 0.5591039
13 0.6999659 0.0003816 0.0 1.4286410 0.0 0.0004884 0.4067219 0.4089047 0.0 1.0 -100.0 42.8641013 0.4194471
14 0.7998636 0.0002528 0.0 1.2502131 0.0 0.0003136 0.3559250 0.3578743 0.0 1.0 -100.0 25.0213129 0.2797903
15 0.8997613 0.0001431 0.0 1.1114058 0.0 0.0001954 0.3164077 0.3181623 0.0 1.0 -100.0 11.1405836 0.1401335
16 1.0 0.0000548 0.0 1.0 0.0 0.0001039 0.2846914 0.2862806 0.0 1.0 -100.0 0.0 0.0
ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.011684883939393473
RMSE: 0.10809664166565709
LogLoss: 0.04218696916105611
AUC: 0.9979652706027412
AUCPR: 0.9961374399921781
Gini: 0.9959305412054824
Null degrees of freedom: 2932
Residual degrees of freedom: 2924
Null deviance: 3505.305034144737
Residual deviance: 247.46876109875512
AIC: 265.4687610987551
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7174627292549592
0 1 Error Rate
0 2095.0 3.0 0.0014 (3.0/2098.0)
1 42.0 793.0 0.0503 (42.0/835.0)
Total 2137.0 796.0 0.0153 (45.0/2933.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
max f1 0.7174627 0.9724096 112.0
max f2 0.1468996 0.9784156 189.0
max f0point5 0.7174627 0.9865638 112.0
max accuracy 0.7174627 0.9846573 112.0
max precision 0.9999199 1.0 0.0
max recall 0.0028657 1.0 365.0
max specificity 0.9999199 1.0 0.0
max absolute_mcc 0.7174627 0.9623200 112.0
max min_per_class_accuracy 0.2379595 0.9808383 169.0
max mean_per_class_accuracy 0.1642521 0.9819737 185.0
max tns 0.9999199 2098.0 0.0
max fns 0.9999199 780.0 0.0
max fps 0.0000287 2098.0 399.0
max tps 0.0028657 835.0 365.0
max tnr 0.9999199 1.0 0.0
max fnr 0.9999199 0.9341317 0.0
max fpr 0.0000287 1.0 399.0
max tpr 0.0028657 1.0 365.0
Gains/Lift Table: Avg response rate: 28.47 %, avg score: 28.48 %
group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
1 0.0102284 0.9999180 3.5125749 3.5125749 1.0 0.9999352 1.0 0.9999352 0.0359281 0.0359281 251.2574850 251.2574850 0.0359281
2 0.0201159 0.9998868 3.5125749 3.5125749 1.0 0.9999013 1.0 0.9999185 0.0347305 0.0706587 251.2574850 251.2574850 0.0706587
3 0.0300034 0.9998575 3.5125749 3.5125749 1.0 0.9998702 1.0 0.9999026 0.0347305 0.1053892 251.2574850 251.2574850 0.1053892
4 0.0402318 0.9998225 3.5125749 3.5125749 1.0 0.9998415 1.0 0.9998871 0.0359281 0.1413174 251.2574850 251.2574850 0.1413174
5 0.0501193 0.9997939 3.5125749 3.5125749 1.0 0.9998088 1.0 0.9998716 0.0347305 0.1760479 251.2574850 251.2574850 0.1760479
6 0.1002387 0.9993996 3.5125749 3.5125749 1.0 0.9996169 1.0 0.9997443 0.1760479 0.3520958 251.2574850 251.2574850 0.3520958
7 0.1500170 0.9989825 3.5125749 3.5125749 1.0 0.9992138 1.0 0.9995683 0.1748503 0.5269461 251.2574850 251.2574850 0.5269461
8 0.2001364 0.9967890 3.5125749 3.5125749 1.0 0.9981442 1.0 0.9992116 0.1760479 0.7029940 251.2574850 251.2574850 0.7029940
9 0.3000341 0.1359474 2.8532178 3.2930389 0.8122867 0.7999760 0.9375 0.9328752 0.2850299 0.9880240 185.3217796 229.3038922 0.9618085
10 0.3999318 0.0113840 0.0719299 2.4884482 0.0204778 0.0379940 0.7084399 0.7093456 0.0071856 0.9952096 -92.8070140 148.8448168 0.8321972
11 0.5001705 0.0037902 0.0358426 1.9969239 0.0102041 0.0065884 0.5685072 0.5685068 0.0035928 0.9988024 -96.4157399 99.6923944 0.6970865
12 0.6000682 0.0016244 0.0119883 1.6664773 0.0034130 0.0025423 0.4744318 0.4742865 0.0011976 1.0 -98.8011690 66.6477273 0.5591039
13 0.6999659 0.0008044 0.0 1.4286410 0.0 0.0011552 0.4067219 0.4067622 0.0 1.0 -100.0 42.8641013 0.4194471
14 0.7998636 0.0003900 0.0 1.2502131 0.0 0.0005843 0.3559250 0.3560332 0.0 1.0 -100.0 25.0213129 0.2797903
15 0.8997613 0.0001276 0.0 1.1114058 0.0 0.0002462 0.3164077 0.3165313 0.0 1.0 -100.0 11.1405836 0.1401335
16 1.0 6.99e-06 0.0 1.0 0.0 0.0000593 0.2846914 0.2848086 0.0 1.0 -100.0 0.0 0.0
Cross-Validation Metrics Summary:
mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
accuracy 0.9866936 0.0038640 0.9829351 0.9917898 0.9831366 0.9863014 0.9893048
aic 66.293755 12.12167 74.20859 56.05992 82.7434 64.225174 54.231678
auc 0.9980263 0.0006794 0.9978026 0.9976698 0.9974441 0.9980351 0.9991797
err 0.0133064 0.0038640 0.0170648 0.0082102 0.0168634 0.0136986 0.0106952
err_count 7.8 2.280351 10.0 5.0 10.0 8.0 6.0
f0point5 0.9826331 0.0084325 0.9843938 0.9799292 0.969697 0.9872979 0.9918478
f1 0.9764566 0.0065211 0.9704142 0.9851632 0.969697 0.9771429 0.9798658
f2 0.9704682 0.0122690 0.9568262 0.9904535 0.969697 0.9671946 0.9681698
lift_top_group 3.5199234 0.1771113 3.3872833 3.6467066 3.5939393 3.2808988 3.6907895
loglikelihood 0.0 0.0 0.0 0.0 0.0 0.0 0.0
--- --- --- --- --- --- --- ---
mean_per_class_error 0.0192730 0.0071964 0.0272222 0.0075189 0.0209926 0.0208944 0.0197368
mse 0.0117021 0.0034641 0.0139158 0.0070053 0.0160867 0.0108037 0.0106989
null_deviance 701.06104 25.972713 711.6207 715.99097 701.4596 719.9654 656.2684
pr_auc 0.9962468 0.0011169 0.9957380 0.9963841 0.9948334 0.9963907 0.997888
precision 0.9868586 0.0130284 0.9939394 0.9764706 0.969697 0.9941860 1.0
r2 0.9425337 0.0169548 0.9331183 0.9648018 0.9198968 0.9490137 0.9458376
recall 0.9665772 0.0171743 0.9479769 0.994012 0.969697 0.9606742 0.9605263
residual_deviance 49.49375 11.132932 56.208588 38.05992 64.7434 48.225174 40.231678
rmse 0.1071745 0.0164205 0.1179653 0.0836975 0.1268335 0.1039410 0.1034354
specificity 0.9948767 0.0049765 0.9975787 0.9909502 0.9883177 0.9975370 1.0
[24 rows x 8 columns]

[tips]
Use `model.explain()` to inspect the model.
--
Use `h2o.display.toggle_user_tips()` to switch on/off this section.
In [64]:
import h2o
h2o.init(ip="localhost", port=54321, start_h2o=False)
from h2o.estimators import H2OXGBoostEstimator

xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
xgb.train(x=x, y=y, training_frame=train)
Warning: if you don't want to start local H2O server, then use of `h2o.connect()` is preferred.
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Warning: Your H2O cluster version is (4 months and 21 days) old.  There may be a newer version available.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
H2O_cluster_uptime: 11 hours 55 mins
H2O_cluster_timezone: Europe/Moscow
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.46.0.6
H2O_cluster_version_age: 4 months and 21 days
H2O_cluster_name: H2O_from_python_25150_lkcgsm
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 7.278 Gb
H2O_cluster_total_cores: 32
H2O_cluster_allowed_cores: 32
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.11.5 final
---------------------------------------------------------------------------
H2OResponseError                          Traceback (most recent call last)
Cell In[64], line 6
      3 from h2o.estimators import H2OXGBoostEstimator
      5 xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
----> 6 xgb.train(x=x, y=y, training_frame=train)

File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:107, in H2OEstimator.train(self, x, y, training_frame, offset_column, fold_column, weights_column, validation_frame, max_runtime_secs, ignored_columns, model_id, verbose)
     88 """
     89 Train the H2O model.
     90 
   (...)
    101 :param bool verbose: Print scoring history to stdout. Defaults to False.
    102 """
    103 parms = self._make_parms(x=x, y=y, training_frame=training_frame, offset_column=offset_column, 
    104                          fold_column=fold_column, weights_column=weights_column, 
    105                          validation_frame=validation_frame, max_runtime_secs=max_runtime_secs, 
    106                          ignored_columns=ignored_columns, model_id=model_id, verbose=verbose)
--> 107 self._train(parms, verbose=verbose)
    108 return self

File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:186, in H2OEstimator._train(self, parms, verbose)
    183 assert_is_type(verbose, bool)
    185 rest_ver = self._get_rest_version(parms)
--> 186 model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    187 job = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))
    189 if model_builder_json["messages"] is not None:

File D:\anaconda\Lib\site-packages\h2o\h2o.py:123, in api(endpoint, data, json, filename, save_to)
    121 # type checks are performed in H2OConnection class
    122 _check_connection()
--> 123 return h2oconn.request(endpoint, data=data, json=json, filename=filename, save_to=save_to)

File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:499, in H2OConnection.request(self, endpoint, data, json, filename, save_to)
    497         save_to = save_to(resp)
    498     self._log_end_transaction(start_time, resp)
--> 499     return self._process_response(resp, save_to)
    501 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
    502     if self._local_server and not self._local_server.is_running():

File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:853, in H2OConnection._process_response(response, save_to)
    851 if status_code in {400, 404, 412} and isinstance(data, H2OErrorV3):
    852     data.show_stacktrace = False
--> 853     raise H2OResponseError(data)
    855 # Server errors (notably 500 = "Server Error")
    856 # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server
    857 # did not provide the correct status code.
    858 raise H2OServerError("HTTP %d %s:\n%s" % (status_code, response.reason, data))

H2OResponseError: Server error water.exceptions.H2ONotFoundArgumentException:
  Error: POST /3/ModelBuilders/xgboost not found
  Request: POST /3/ModelBuilders/xgboost
    data: {'training_frame': 'py_10_sid_baac', 'nfolds': '0', 'keep_cross_validation_models': 'True', 'keep_cross_validation_predictions': 'False', 'keep_cross_validation_fold_assignment': 'False', 'score_each_iteration': 'False', 'fold_assignment': 'auto', 'response_column': 'Healthy_Status', 'ignore_const_cols': 'True', 'stopping_rounds': '0', 'stopping_metric': 'auto', 'stopping_tolerance': '0.001', 'max_runtime_secs': '0.0', 'seed': '42', 'distribution': 'auto', 'tweedie_power': '1.5', 'categorical_encoding': 'auto', 'quiet_mode': 'True', 'ntrees': '100', 'max_depth': '5', 'min_rows': '1.0', 'min_child_weight': '1.0', 'learn_rate': '0.1', 'eta': '0.3', 'sample_rate': '1.0', 'subsample': '1.0', 'col_sample_rate': '1.0', 'colsample_bylevel': '1.0', 'col_sample_rate_per_tree': '1.0', 'colsample_bytree': '1.0', 'colsample_bynode': '1.0', 'max_abs_leafnode_pred': '0.0', 'max_delta_step': '0.0', 'score_tree_interval': '0', 'min_split_improvement': '0.0', 'gamma': '0.0', 'nthread': '-1', 'build_tree_one_node': 'False', 'parallelize_cross_validation': 'True', 'calibrate_model': 'False', 'calibration_method': 'auto', 'max_bins': '256', 'max_leaves': '0', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': '0.0', 'one_drop': 'False', 'skip_drop': '0.0', 'tree_method': 'auto', 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_lambda': '1.0', 'reg_alpha': '0.0', 'dmatrix_type': 'auto', 'backend': 'auto', 'gainslift_bins': '-1', 'auc_type': 'auto', 'scale_pos_weight': '1.0', 'score_eval_metric_only': 'False'}
In [65]:
cd path\to\your\h2o\folder
java -Xmx4g -jar h2o.jar
  Cell In[65], line 1
    cd path\to\your\h2o\folder
            ^
SyntaxError: unexpected character after line continuation character
In [66]:
import h2o
h2o.init(ip="localhost", port=54323)
Checking whether there is an H2O instance running at http://localhost:54323. connected.
Warning: Your H2O cluster version is (4 months and 21 days) old.  There may be a newer version available.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
H2O_cluster_uptime: 51 mins 21 secs
H2O_cluster_timezone: Europe/Moscow
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.46.0.6
H2O_cluster_version_age: 4 months and 21 days
H2O_cluster_name: 25150
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 4 Gb
H2O_cluster_total_cores: 32
H2O_cluster_allowed_cores: 32
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54323
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
Python_version: 3.11.5 final
In [ ]: